c( # I use linux, this saves my time loading/installing packages
  "ProjectTemplate", "here", "stringr",
  "dplyr", "knitr", "ggplot2", "tabplot",
  "patchwork", "tableone", "tidyr", "visdat", "glmnet",
  "doParallel", "kableExtra", "dtplyr", "purrr",
  "kableExtra"
) |>
  lapply(function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x)
    }
  })
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
NULL

[[17]]
NULL
set.seed(42) # setting seed for stochastic functions
setwd("..") # here()) # needed as we are in /src, in linux here() should be used

load.project() # Loading the project
Project name: 2023-retention-ABCD
Loading project configuration
Autoloading helper functions
 Running helper script: globals.R
 Running helper script: helpers.R
 Running helper script: pclean.R
Autoloading data
Munging data
 Running preprocessing script: 01-A.R
pclean()
[1] "Hey! Cleaning files at src folder, look in reports for them."
# R options
options(
  digits = 3, # Only two decimal digits
  scipen = 999, # Remove scientific notation
  width = 100
)


# Knitr options
knitr::opts_chunk$set(
  comment = NA, # remove comment symbol
  cache.path = "../cache/", # where should I save cache?
  fig.path = "../graphs/", # where should I save figures?
  echo = T, # dont echo by default
  cache = F, # dont cache by default
  fig.width = 10, # setting the best witdth for figures
  fig.height = 7, # best height
  dpi = 300, # high dpi for publication quality,
  error = FALSE # do not interrupt in case of errors
)


cb_palette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

theme_luis <- function() {
  return_theme <- ggplot2::theme_bw(12) +
    ggplot2::theme(
      panel.border = element_rect(colour = "black"),
      legend.background = element_rect(linetype = 1, size = 0.2, colour = 1)
    )
}

# register parallel backend
cl <- makeCluster(detectCores()-3, outfile = "")
registerDoParallel(cl)
retention  <- read.csv("~/Downloads/dataIndiv_ID_2023-03-08.csv") %>%
  filter(event == "Baseline" | event == "1 Year") %>%
  select(pguid, event, visit_status_aggr, income_household, site, race_ethnicity, education) %>%
  mutate(missed = ifelse(visit_status_aggr == "Missed (explained)" |
                visit_status_aggr == "Missed (unexplained)", 0, 1),
  site = as.factor(site),
  race_ethnicity = factor(race_ethnicity),
  education = factor(education, ordered = T),
  income_household = factor(income_household, ordered = T)
  )

# below is a version with wide format, which is the correct way, but there is an oddity in
# the data that the values for some demographic info only appear at the follow up, not at the baseline
# so I suppose the data was joined incorrectly.

# retention_wide <- dataIndiv_ID_2023.03.08 %>%
#   select(pguid, event, visit_status_aggr, income_household, site, race_ethnicity, education) %>%
#   group_by(pguid) %>%
#   filter(event == "Baseline" | event == "1 Year") %>%
#   mutate(missed = ifelse(visit_status_aggr == "Missed (explained)" |
#                 visit_status_aggr == "Missed (unexplained)", 1, 0)) %>%
#   pivot_wider(names_from = event, values_from = c(missed, income_household, site, race_ethnicity, education)) %>%
#   mutate(`missed_1 Year` = ifelse(is.na(`missed_1 Year`), 0, `missed_1 Year`))


# retention %>% 
#   group_by(event) %>% 
#   summarise(n = n(), 
#             n_missed = sum(missed_visit_1y), 
#             perc_missed = round(n_missed/n, 2)) |>
#   kable(align = "c", caption = "Number of participants and percentage of missed visits by event") |>
#   kable_styling()
# # About 5% of participants missed their 1 year visit

Modeling

Leave-one-site-out cross validation and coeff

# Initialize empty vectors for storing results
lambda_values <- c()
RMSE_values <- c()
var_imp_avg <- NULL

# Iterate over the sites
for (isite in unique(retention$site)) {
  # Split retention into train and test
  train <- retention %>%
    filter(site != isite)  # Leave out the current site

  test <- retention %>%
    filter(site == isite)  # Use only the current site

  # Prepare dataset in glmnet format for train and test sets
  outcome_train <- data.matrix(train$missed)
  outcome_test <- data.matrix(test$missed)

  predictors_train <- train %>%
    select(income_household,  race_ethnicity, education) %>%
    data.matrix()

  predictors_test <- test %>%
    select(income_household,  race_ethnicity, education) %>%
    model.matrix(~ ., data = .)

  message("Site: ", isite)

  # Perform cross-validated glmnet model fitting
  cv_model <- cv.glmnet(predictors_train, outcome_train,
                        nfolds = 21,  # Use LOOCV
                family = "binomial",
  parallel =T)

  # Get the best lambda
  best_lambda <- cv_model$lambda.1se

  # Fit the model using the best lambda on the test set
  fit <- glmnet(predictors_test, outcome_test,
                alpha = 1,
                type.measure = "mse",
                lambda = best_lambda,
                family = "binomial",
                intercept = FALSE)

  var_imp <- broom::tidy(fit)

  # Store the results
  RMSE <- cv_model$cvm[cv_model$lambda == cv_model$lambda.1se] %>% sqrt() %>% round(2)
  lambda <- cv_model$lambda.1se %>% round(2)

  if (is.null(var_imp_avg)) {
    var_imp_avg <- var_imp |>
      mutate(site = isite) 
  } else {
    # var_imp_avg <- full_join(var_imp_avg, var_imp, by = c("class","term"))
    var_imp_avg <- var_imp |>
      mutate(site = isite) |>
      bind_rows(var_imp_avg)

    lambda_values <- lambda_values %>% append(lambda)
    RMSE_values <- RMSE_values %>% append(RMSE)
  }
}
Site: OHSU
Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one multinomial or
binomial class has fewer than 8 observations; dangerous ground
Site: UCSD
Site: YALE
Site: ROC
Site: WUSTL
Site: UMB
Site: UVM
Site: FIU
Site: UFL
Site: LIBR
Site: UMN
Site: VCU
Site: UTAH
Site: CUB
Site: MUSC
Site: CHLA
Site: UMICH
Site: UCLA
Site: SRI
Site: UPMC
Site: UWM
a <- var_imp_avg %>%
  # filter(class == 0) %>%
  group_by(term) %>%
  summarise(avg_coef = mean(estimate), 
            n = n(),
            std_error = sd(estimate) / sqrt(n)) %>%
  mutate(lower = avg_coef - 1.96 * std_error,
         upper = avg_coef + 1.96 * std_error
         ) %>%
  drop_na() # will be only keeping vars that consistently appear in all sites

Factors associated w returning to follow-up, consistently across sites

Table

a %>% 
  select(term, avg_coef, lower, upper) %>%
  # decreasing avg_coef
  arrange(desc(avg_coef)) %>%
  #mutate(Importance = round(Importance, 2)) %>% 
  kableExtra::kbl(booktabs = T,
                  col.names = c("Factor", "mean coeff.", "95% CI - lower", "95% CI - upper"),
                  escape = TRUE) %>%
  kable_styling(latex_options = c("striped", "hold_position", "repeat_header"),
  full_width = F)
Factor mean coeff. 95% CI - lower 95% CI - upper
education.L 3.963 2.766 5.160
race_ethnicityWhite 2.573 2.234 2.913
race_ethnicityHispanic 2.099 1.801 2.396
race_ethnicityOther 1.786 1.524 2.048
race_ethnicityBlack 1.775 1.437 2.113
education^10 1.469 0.974 1.964
education^9 1.071 0.817 1.325
race_ethnicityUnknown 0.885 0.684 1.085
education^17 0.597 -0.451 1.644
education^5 0.568 0.010 1.126
education^11 0.242 -0.160 0.644
education^14 -0.246 -0.623 0.132
income_household.Q -0.371 -0.672 -0.071
education^19 -0.381 -0.834 0.072
education^20 -0.479 -1.232 0.274
income_household.L -0.511 -0.858 -0.163
education^21 -0.787 -1.402 -0.171
education^13 -0.827 -1.219 -0.435
education.C -1.602 -2.132 -1.072

Point-in-range plot - predicting retention

# generate a point-in-range plot similar to the table above
a %>%
  ungroup() %>%
  # arrange by avg_coef
  ggplot(aes(x = term, y = avg_coef)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  labs(x = "Factor", y = "Mean coefficient", title = "Point-in-range plot of the mean coefficient of each factor") +
  theme_bw() 

GLMnet model performance

The lambda space

plot(cv_model)

Plot last fit

It does not work too well in the Leave-one-site-out cv, figure out how to provide this plot in this case.

plot(fit, xvar = "dev", label = TRUE)

The lambda space

Looks weird in the binomial case, ok in the multinomial. Why? Results are identical.

plot(cv_model)

Plot last fit

plot(fit, xvar = "dev", label = TRUE)

Basic descriptive characteristics

Missingness

Tabplot

library(tabplot)
tableplot(retention)

Vizmiss

library(visdat)
vis_miss(retention)

Distribution of age by sex

Category names?

dataIndiv_ID_2023.03.08 %>%
  filter(kbi_gender %in% c(1, 2, 3)) %>%
  ggplot(aes(as.numeric(interview_age), fill = as.factor(kbi_gender))) +
  geom_density(alpha = .3) +
  labs(
    title = "Age Male x Female", x = "Age",
    y = "Density"
  ) +
  theme_luis()
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2
3.4.0.
ℹ Please use the `linewidth` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning
was generated.
Warning: Removed 12727 rows containing non-finite values (`stat_density()`).

System information

project.info
$config
$config$version
[1] "0.10.2"

$config$data_loading
[1] TRUE

$config$data_loading_header
[1] TRUE

$config$data_ignore
[1] ""

$config$cache_loading
[1] TRUE

$config$recursive_loading
[1] FALSE

$config$munging
[1] TRUE

$config$logging
[1] FALSE

$config$logging_level
[1] "INFO"

$config$load_libraries
[1] FALSE

$config$libraries
[1] "reshape2, plyr, tidyverse, stringr, lubridate"

$config$as_factors
[1] FALSE

$config$tables_type
[1] "tibble"

$config$attach_internal_libraries
[1] FALSE

$config$cache_loaded_data
[1] TRUE

$config$sticky_variables
[1] "NONE"

$config$underscore_variables
[1] TRUE

$config$cache_file_format
[1] "RData"


$helpers
[1] "globals.R" "helpers.R" "pclean.R" 
sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-solus-linux-gnu (64-bit)
Running under: Solus 4.3 Fortitude

Matrix products: default
BLAS:   /usr/lib64/haswell/libopenblas_haswellp-r0.3.20.so
LAPACK: /usr/lib64/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=pt_BR.UTF-8       
 [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] rmarkdown_2.19         purrr_1.0.1            dtplyr_1.2.2           kableExtra_1.3.4      
 [5] doParallel_1.0.17      iterators_1.0.14       foreach_1.5.2          glmnet_4.1-6          
 [9] Matrix_1.5-1           visdat_0.6.0           tidyr_1.3.0            tableone_0.13.2       
[13] patchwork_1.1.2        tabplot_1.4.1          ffbase_0.13.2          ff_4.0.9              
[17] bit_4.0.5              ggplot2_3.4.0          knitr_1.41             dplyr_1.1.2           
[21] stringr_1.5.0          here_1.0.1             ProjectTemplate_0.10.2 tibble_3.2.1          
[25] digest_0.6.31          nvimcom_0.9-131       

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.9        svglite_2.1.1     lattice_0.20-45   rprojroot_2.0.3   utf8_1.2.2       
 [6] R6_2.5.1          backports_1.4.1   survey_4.1-1      evaluate_0.19     highr_0.10       
[11] httr_1.4.4        pillar_1.9.0      rlang_1.1.0       rstudioapi_0.14   data.table_1.14.6
[16] jquerylib_0.1.4   labeling_0.4.2    splines_4.2.2     webshot_0.5.4     munsell_0.5.0    
[21] broom_1.0.2       compiler_4.2.2    xfun_0.39         pkgconfig_2.0.3   systemfonts_1.0.4
[26] shape_1.4.6       htmltools_0.5.4   mitools_2.4       tidyselect_1.2.0  codetools_0.2-18 
[31] fansi_1.0.3       viridisLite_0.4.1 crayon_1.5.2      withr_2.5.0       grid_4.2.2       
[36] jsonlite_1.8.4    gtable_0.3.1      lifecycle_1.0.3   DBI_1.1.3         magrittr_2.0.3   
[41] scales_1.2.1      cachem_1.0.6      cli_3.6.0         stringi_1.7.12    farver_2.1.1     
[46] bslib_0.4.2       xml2_1.3.3        generics_0.1.3    vctrs_0.6.1       fastmatch_1.1-3  
[51] tools_4.2.2       glue_1.6.2        yaml_2.3.6        fastmap_1.1.0     survival_3.4-0   
[56] colorspace_2.0-3  rvest_1.0.3       sass_0.4.6       

References